{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install boto3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import sagemaker\n",
    "import pandas as pd\n",
    "\n",
    "sess   = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name\n",
    "\n",
    "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prefix_train = 'feature-store/amazon-reviews/csv/balanced-raw-with-header/train'\n",
    "\n",
    "balanced_raw_with_header_train_s3_uri = 's3://{}/{}/data.csv'.format(bucket, prefix_train)\n",
    "\n",
    "print(balanced_raw_with_header_train_s3_uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prefix_output = 'models/amazon-reviews/autopilot'\n",
    "\n",
    "autopilot_model_output_s3_uri = 's3://{}/{}'.format(bucket, prefix_output)\n",
    "\n",
    "print(autopilot_model_output_s3_uri)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Launch SageMaker Studio from the AWS Console"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Launch SageMaker Studio](img/sagemaker-console-home.png)\n",
    "\n",
    "![Launch SageMaker Studio](img/sagemaker-studio-console-home.png)\n",
    "\n",
    "![Launch SageMaker Studio](img/sagemaker-studio-console-open-studio.png)\n",
    "\n",
    "![Launch SageMaker Studio](img/sagemaker-studio-home.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train a Model with 1-Click!\n",
    "\n",
    "Experiment Name:  `amazon-reviews-sentiment`\n",
    "\n",
    "S3 Location of Input Data:  `s3://<bucket-name-from-above>/feature-store/amazon-reviews/csv/balanced-raw-with-header/data.csv`\n",
    "\n",
    "Target Attribute Name:  `is_positive_sentiment`\n",
    "\n",
    "S3 Location of Output Data:  `s3://<bucket-name-from-above>/output`\n",
    "\n",
    "![Train Model with 1-Click](img/sagemaker-studio-create-experiment.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tracking the progress of the AutoPilot job\n",
    "SageMaker AutoPilot job consists of four high-level steps : \n",
    "* Data Preprocessing, where the dataset is split into train and validation sets.\n",
    "* Recommending Pipelines, where the dataset is analyzed and SageMaker AutoPilot comes up with a list of ML pipelines that should be tried out on the dataset.\n",
    "* Automatic Feature Engineering, where SageMaker AutoPilot performs feature transformation on individual features of the dataset as well as at an aggregate level.\n",
    "* ML pipeline selection and hyperparameter tuning, where the top performing pipeline is selected along with the optimal hyperparameters for the training algorithm (the last stage of the pipeline). "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sleep for a bit to ensure the AutoML job above has time to start\n",
    "import time\n",
    "time.sleep(3)\n",
    "\n",
    "job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
    "job_status = job['AutoMLJobStatus']\n",
    "job_sec_status = job['AutoMLJobSecondaryStatus']\n",
    "\n",
    "if job_status not in ('Stopped', 'Failed'):\n",
    "    while job_status in ('InProgress') and job_sec_status in ('AnalyzingData'):\n",
    "        job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
    "        job_status = job['AutoMLJobStatus']\n",
    "        job_sec_status = job['AutoMLJobSecondaryStatus']\n",
    "        print(job_status, job_sec_status)\n",
    "        sleep(30)\n",
    "    print(\"Data analysis complete\")\n",
    "    \n",
    "print(job)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
    "job_status = job['AutoMLJobStatus']\n",
    "job_sec_status = job['AutoMLJobSecondaryStatus']\n",
    "print(job_status)\n",
    "print(job_sec_status)\n",
    "if job_status not in ('Stopped', 'Failed'):\n",
    "    while job_status in ('InProgress') and job_sec_status in ('FeatureEngineering'):\n",
    "        job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
    "        job_status = job['AutoMLJobStatus']\n",
    "        job_sec_status = job['AutoMLJobSecondaryStatus']\n",
    "        print(job_status, job_sec_status)\n",
    "        sleep(30)\n",
    "    print(\"Feature engineering complete\")\n",
    "    \n",
    "print(job)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
    "job_status = job['AutoMLJobStatus']\n",
    "job_sec_status = job['AutoMLJobSecondaryStatus']\n",
    "print(job_status)\n",
    "print(job_sec_status)\n",
    "if job_status not in ('Stopped', 'Failed'):\n",
    "    while job_status in ('InProgress') and job_sec_status in ('ModelTuning'):\n",
    "        job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
    "        job_status = job['AutoMLJobStatus']\n",
    "        job_sec_status = job['AutoMLJobSecondaryStatus']\n",
    "        print(job_status, job_sec_status)\n",
    "        sleep(30)\n",
    "    print(\"Model tuning complete\")\n",
    "    \n",
    "print(job)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Viewing all candidates explored by SageMaker AutoPilot\n",
    "Once model tuning is complete, you can view all the candidates (pipeline evaluations with different hyperparameter combinations) that were explored by AutoML and sort them by their final performance metric."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "candidates = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, \n",
    "                                                SortBy='FinalObjectiveMetricValue')['Candidates']\n",
    "for index, candidate in enumerate(candidates):\n",
    "    print(str(index) + \"  \" \n",
    "        + candidate['CandidateName'] + \"  \" \n",
    "        + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inspect SageMaker AutoPilot trials with Amazon SageMaker Experiments\n",
    "SageMaker AutoPilot automatically creates a new experiment, and pushes information for each trial. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sagemaker.analytics import ExperimentAnalytics, TrainingJobAnalytics\n",
    "\n",
    "exp = ExperimentAnalytics(\n",
    "    sagemaker_session=sess, \n",
    "    experiment_name=auto_ml_job_name + '-aws-auto-ml-job',\n",
    ")\n",
    "\n",
    "df = exp.dataframe()\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Viewing notebooks generated by SageMaker AutoPilot\n",
    "Once data analysis is complete, SageMaker AutoPilot generates two notebooks: \n",
    "* Data exploration,\n",
    "* Candidate definition."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
    "print(job)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Let's copy all of the generated resources including the two notebooks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "generated_resources = job['AutoMLJobArtifacts']['DataExplorationNotebookLocation'].rstrip('notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb')\n",
    "generated_resources"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm -rf ./generated_module\n",
    "!rm -rf ./notebooks\n",
    "!aws s3 cp --recursive $generated_resources ."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### In the file view, open the `notebooks/` and `generated_module/` folders.  Lots of useful information in there!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Deploying the best candidate\n",
    "Now that we have successfully completed the AutoML job on our dataset and visualized the trials, we can create a model from any of the trials with a single API call and then deploy that model for online or batch prediction using [Inference Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/inference-pipelines.html). For this notebook, we deploy only the best performing trial for inference."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The best candidate is the one we're really interested in."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['BestCandidate']\n",
    "best_candidate_identifier = best_candidate['CandidateName']\n",
    "\n",
    "print(\"Candidate name: \" + best_candidate_identifier)\n",
    "print(\"Metric name: \" + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])\n",
    "print(\"Metric value: \" + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_candidate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can see the containers and models composing the Inference Pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for container in best_candidate['InferenceContainers']:\n",
    "    print(container['Image'])\n",
    "    print(container['ModelDataUrl'])\n",
    "    print('======================')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = 'automl-dm-model-' + timestamp_suffix\n",
    "\n",
    "model_arn = sm.create_model(Containers=best_candidate['InferenceContainers'],\n",
    "                            ModelName=model_name,\n",
    "                            ExecutionRoleArn=role)\n",
    "\n",
    "print('Best candidate model ARN: ', model_arn['ModelArn'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's deploy the pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# EndpointConfig name\n",
    "timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n",
    "epc_name = 'automl-dm-epc-' + timestamp_suffix\n",
    "\n",
    "# Endpoint name\n",
    "xgb_endpoint_name = 'automl-dm-ep-' + timestamp_suffix\n",
    "variant_name = 'automl-dm-variant-' + timestamp_suffix\n",
    "\n",
    "print(xgb_endpoint_name)\n",
    "print(variant_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ep_config = sm.create_endpoint_config(EndpointConfigName = xgb_endpoint_name,\n",
    "                                      ProductionVariants=[{'InstanceType':'ml.m4.xlarge',\n",
    "                                                           'InitialInstanceCount':1,\n",
    "                                                           'ModelName':model_name,\n",
    "                                                           'VariantName':variant_name}])\n",
    "\n",
    "create_endpoint_response = sm.create_endpoint(EndpointName=xgb_endpoint_name,\n",
    "                                              EndpointConfigName=epc_name)\n",
    "print(create_endpoint_response['EndpointArn'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "sm.get_waiter('endpoint_in_service').wait(EndpointName=xgb_endpoint_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resp = sm.describe_endpoint(EndpointName=xgb_endpoint_name)\n",
    "status = resp['EndpointStatus']\n",
    "\n",
    "print(\"Arn: \" + resp['EndpointArn'])\n",
    "print(\"Status: \" + status)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scoring the Best Candidate\n",
    "Let's predict and score the validation set. We'll compute metrics ourselves just for fun."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm_runtime = boto3.client('sagemaker-runtime')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#ep_name = ''\n",
    "\n",
    "csv_line_predict_positive = \"\"\"I loved it!  I wish there was a new season...\"\"\"\n",
    "response = sm_runtime.invoke_endpoint(EndpointName=xgb_endpoint_name, ContentType='text/csv', Accept='text/csv', Body=csv_line_predict_positive)\n",
    "\n",
    "response_body = response['Body'].read().decode(\"utf-8\").strip()\n",
    "response_body"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_line_predict_negative = \"\"\"This isn't good.  Complete waste of time.\"\"\"\n",
    "response = sm_runtime.invoke_endpoint(EndpointName=xgb_endpoint_name, ContentType='text/csv', Accept='text/csv', Body=csv_line_predict_negative)\n",
    "\n",
    "response_body = response['Body'].read().decode(\"utf-8\").strip()\n",
    "response_body"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculate Test Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prefix_test = 'feature-store/amazon-reviews/csv/balanced-raw-with-header/test'\n",
    "\n",
    "balanced_raw_with_header_test_s3_uri = 's3://{}/{}/data.csv'.format(bucket, prefix_test)\n",
    "\n",
    "balanced_raw_with_header_test_path = './{}/data.csv'.format(prefix_test)\n",
    "\n",
    "print(balanced_raw_with_header_test_s3_uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 cp $balanced_raw_with_header_test_s3_uri $balanced_raw_with_header_test_path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset(path, sep, header):\n",
    "    data = pd.read_csv(path, sep=sep, header=header)\n",
    "\n",
    "    labels = data.iloc[:,0]\n",
    "    features = data.drop(data.columns[0], axis=1)\n",
    "    \n",
    "    if header==None:\n",
    "        # Adjust the column names after dropped the 0th column above\n",
    "        # New column names are 0 (inclusive) to len(features.columns) (exclusive)\n",
    "        new_column_names = list(range(0, len(features.columns)))\n",
    "        features.columns = new_column_names\n",
    "\n",
    "    return features, labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test, y_test = load_dataset(path=balanced_raw_with_header_test_path, sep=',', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "payload_500_samples = X_test[:500].to_csv(index=False, header=False).rstrip()\n",
    "\n",
    "response_500_samples = sm_runtime.invoke_endpoint(EndpointName=xgb_endpoint_name, \n",
    "                                                  Body=payload_500_samples.encode('utf-8'),\n",
    "                                                  ContentType='text/csv')['Body'].read()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions_500_samples = np.fromstring(response_500_samples, sep=',')\n",
    "predictions_500_samples_0_or_1 = np.where(predictions_500_samples > 0.5, 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Test Accuracy: ', accuracy_score(y_test[:500], predictions_500_samples_0_or_1))\n",
    "print('Test Precision: ', precision_score(y_test[:500], predictions_500_samples_0_or_1, average=None))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sn\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "df_cm_test = confusion_matrix(y_test[:500], predictions_500_samples_0_or_1)\n",
    "df_cm_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sn\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "df_cm_test = confusion_matrix(y_test[:500], predictions_500_samples_0_or_1)\n",
    "df_cm_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "auc = round(metrics.roc_auc_score(y_test, preds_test), 4)\n",
    "print('AUC is ' + repr(auc))\n",
    "\n",
    "fpr, tpr, _ = metrics.roc_curve(y_test, preds_test)\n",
    "\n",
    "plt.title('ROC Curve')\n",
    "plt.plot(fpr, tpr, 'b',\n",
    "label='AUC = %0.2f'% auc)\n",
    "plt.legend(loc='lower right')\n",
    "plt.plot([0,1],[0,1],'r--')\n",
    "plt.xlim([-0.1,1.1])\n",
    "plt.ylim([-0.1,1.1])\n",
    "plt.ylabel('True Positive Rate')\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
